library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0          ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.5.9000
## ✔ tidyr   0.8.0          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Matrices and their Motivation

It is current practice to measure many variables on the same patients, we may have all the biometrical characteristics, height, weight, BMI, age as well as clinical variables such as blood pressure, blood sugar, heart rate for 100 patients, these variables will not be independent.

What are the data?

  • To start off, a useful toy example we’ll use is from the sports world; performances of decathlon athletes.

    These are measurements of athletes’ performances in the decathlon: the variables m100, m400, m1500 are performance times in seconds for the 100 metres, 400 metres and 1500 meters respectively, ‘m110‘ is the time taken to finish the 110 meters hurdles whereas pole is the pole-jump height, weight is the length in metres the athletes threw the weight.

    m100 long weight highj m400 m110 disc pole jave m1500
    1 11.25 7.43 15.48 2.27 48.90 15.13 49.28 4.70 61.32 268.95
    2 10.87 7.45 14.97 1.97 47.71 14.46 44.36 5.10 61.76 273.02
    3 11.18 7.44 14.20 1.97 48.29 14.81 43.66 5.20 64.16 263.20
    4 10.62 7.38 15.02 2.03 49.06 14.72 44.80 4.90 64.04 285.11

Car data

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Diabetes

  • Clinical measurements (‘diabetes‘ data). This data measures glucose levels in the blood after fasting (glufast), after a test condition (glutest) as well as steady state plasma glucose (steady) and steady state (insulin) for diabetes, the sixth variable is not continuous and is considered a supplementary variable as we will see.
diabetes=read.table(url("http://bios221.stanford.edu/data/diabetes.txt"),
                    header=TRUE,row.names=1)
diabetes[1:4,]
##   relwt glufast glutest steady insulin Group
## 1  0.81      80     356    124      55     3
## 3  0.94     105     319    143     105     3
## 5  1.00      90     323    240     143     3
## 7  0.91     100     350    221     119     3

Microbial Ecology

  • Operational Taxon Unit read counts in a microbial ecology study; the columns represent different ‘species’ of bacteria, the rows are labeled for the samples.
data("GlobalPatterns", package = "phyloseq")
GPOTUs = as.matrix(t(phyloseq::otu_table(GlobalPatterns)))
GPOTUs[1:4, 6:13]
## OTU Table:          [4 taxa and 8 samples]
##                      taxa are rows
##         246140 143239 244960 255340 144887 141782 215972 31759
## CL3          0      7      0    153      3      9      0     0
## CC1          0      1      0    194      5     35      3     1
## SV1          0      0      0      0      0      0      0     0
## M31Fcsw      0      0      0      0      0      0      0     0

RNA-Seq

  • Here are some RNA-seq transcriptomic data showing numbers of mRNA reads present for different patient samples, the rows are patients and the columns are the genes.

               FBgn0000017 FBgn0000018 FBgn0000022 FBgn0000024 FBgn0000028 FBgn0000032
    untreated1        4664         583           0          10           0        1446
    untreated2        8714         761           1          11           1        1713
    untreated4        3150         310           0           3           0         672
    treated1          6205         722           0          10           0        1698
    treated3          3334         308           0           5           1         757

Mass Spectroscopy

  • Mass spectroscopy data where we have samples containing informative labels (knockout versus wildtype mice) and protein \(\times\) features designated by their m/z number.

    mz       129.9816   72.08144  151.6255  142.0349  169.0413    186.0355
    KOGCHUM1  60515      181495          0    196526    25500    51504.40
    WTGCHUM1 252579       54697        412    487800    48775    130491.15
    WTGCHUM2 187859       56318      46425   454226    45626    100845.01

Expression Data (microarray)

  • Here the rows are samples from different subjects and different T cell types and the columns are a subset of gene expression measurements on the 156 most differentially expressed genes (Holmes2005memory).
#######Melanoma/Tcell Data: Peter Lee, Susan Holmes, PNAS.
load(url("http://bios221.stanford.edu/data/Msig3transp.RData"))
round(Msig3transp,2)[1:5,1:6]
##              X3968 X14831 X13492 X5108 X16348  X585
## HEA26_EFFE_1 -2.61  -1.19  -0.06 -0.15   0.52 -0.02
## HEA26_MEM_1  -2.26  -0.47   0.28  0.54  -0.37  0.11
## HEA26_NAI_1  -0.27   0.82   0.81  0.72  -0.90  0.75
## MEL36_EFFE_1 -2.24  -1.08  -0.24 -0.18   0.64  0.01
## MEL36_MEM_1  -2.68  -0.15   0.25  0.95  -0.20  0.17
celltypes=factor(substr(rownames(Msig3transp),7,9))
status=factor(substr(rownames(Msig3transp),1,3))

The voting data

#house=read.table("/Users/susan/Dropbox/CaseStudies/votes.txt")
#head(house[,1:5])
#party=scan("/Users/susan/Dropbox/CaseStudies/party.txt")
#table(party)
library(dplyr)
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
my_mtcars <- select(mtcars, mpg, wt, qsec, cyl, gear, am) %>%
  mutate(cyl = factor(cyl), gear = factor(gear), am = factor(am)) 
ggpairs(my_mtcars, aes(color = cyl))
library(dplyr)
library(GGally)
my_mtcars <- select(mtcars, mpg, wt, qsec) 
ggpairs(my_mtcars)
my_mtcars <- select(mtcars, mpg, wt, qsec, hp) 
apply(my_mtcars, 2, mean)
##       mpg        wt      qsec        hp 
##  20.09062   3.21725  17.84875 146.68750
apply(my_mtcars, 2, sd)
##        mpg         wt       qsec         hp 
##  6.0269481  0.9784574  1.7869432 68.5628685
my_mtcars_centered = scale(my_mtcars, center = TRUE, scale = FALSE)
apply(my_mtcars_centered, 2, mean)
##          mpg           wt         qsec           hp 
## 4.440892e-16 3.469447e-17 9.436896e-16 0.000000e+00
apply(my_mtcars_centered, 2, sd)
##        mpg         wt       qsec         hp 
##  6.0269481  0.9784574  1.7869432 68.5628685
my_mtcars_centered_scaled = scale(my_mtcars)
apply(my_mtcars_centered_scaled, 2, mean)
##          mpg           wt         qsec           hp 
## 7.112366e-17 4.681043e-17 5.299580e-16 1.040834e-17
apply(my_mtcars_centered_scaled, 2, sd)
##  mpg   wt qsec   hp 
##    1    1    1    1
my_mtcars_centered_scaled <- as.data.frame(my_mtcars_centered_scaled) 
my_mtcars_centered_scaled$cyl <- factor(mtcars$cyl)
ggplot(my_mtcars_centered_scaled,
       aes(x = mpg, y = wt, col = cyl)) +
  geom_point(size = 3) +
  theme(text = element_text(size = 30))

ggplot(mtcars,
       aes(x = mpg, y = wt, col = factor(cyl))) +
  geom_point(size = 3) +
  theme(text = element_text(size = 30)) +
  scale_color_discrete(name = "cyl")

mtcars PCA

library(ade4)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
my_mtcars <- select(mtcars, mpg, wt, disp, hp, qsec, drat)
pca_mtcars = dudi.pca(my_mtcars, scannf = FALSE, nf = 3)
fviz_eig(pca_mtcars, geom = "bar", bar_width = 0.7) + ggtitle("")

fviz_pca_ind(pca_mtcars, repel = FALSE) +
  coord_fixed() + 
  ylim(-3.5, 3.5) + xlim(-5, 5) +
  theme(text = element_text(size = 15))

fviz_pca_ind(pca_mtcars,
             col.ind = factor(mtcars$am), 
             repel = TRUE) + # TRUE to avoid text overlapping (slow if many points)
  coord_fixed() +
  ylim(-3.5, 3.5) + xlim(-5, 5) +
  theme(text = element_text(size = 15))

fviz_pca_biplot(pca_mtcars, repel = TRUE, arrowsize = 1) +
  coord_fixed() +
  ylim(-3.5, 3.5) + xlim(-5, 5) +
  theme(text = element_text(size = 15))

fviz_pca_biplot(pca_mtcars, 
                geom = "point",
                arrowsize = 1,
                pointsize = 2) +
  coord_fixed() +
  ylim(-3.5, 3.5) + xlim(-5, 5) +
  theme(text = element_text(size = 15))

fviz_pca_biplot(pca_mtcars, 
                col.ind = factor(mtcars$cyl),
                palette = c("#00AFBB", "#E7B800", "#FC4E07"),
                geom = "point",
                arrowsize = 1,pointsize = 2, 
                addEllipses = TRUE, 
                ellipse.level = 0.6,
#                ellipse.type = "confidence",
                legend.title = "No. cylinders") +
  coord_fixed() +
  ylim(-3.5, 3.5) + xlim(-5, 5) +
  theme(text = element_text(size = 15))

Correlation Circle

cor(as.matrix(my_mtcars)) %>% round(1)
##       mpg   wt disp   hp qsec drat
## mpg   1.0 -0.9 -0.8 -0.8  0.4  0.7
## wt   -0.9  1.0  0.9  0.7 -0.2 -0.7
## disp -0.8  0.9  1.0  0.8 -0.4 -0.7
## hp   -0.8  0.7  0.8  1.0 -0.7 -0.4
## qsec  0.4 -0.2 -0.4 -0.7  1.0  0.1
## drat  0.7 -0.7 -0.7 -0.4  0.1  1.0
fviz_pca_var(pca_mtcars) 

# Color by cos2 values: quality on the factor map
fviz_pca_var(pca_mtcars, col.var = "cos2",
             gradient.cols = viridis::viridis(100)[1:95], 
             repel = TRUE # Avoid text overlapping
             )

Mass Spec Data PCA

library(ade4)
library(factoextra)

load("mat1xcms.RData")
pcamat1 = dudi.pca(t(mat1), scannf = FALSE, nf = 3)
fviz_eig(pcamat1, geom = "bar", bar_width = 0.7) + ggtitle("")

dfmat1 = cbind(pcamat1$li, tibble(
    label = rownames(pcamat1$li),
    number = substr(label, 3, 4),
    type = factor(substr(label, 1, 2))))
pcsplot = ggplot(dfmat1, 
  aes(x=Axis1, y=Axis2, label=label, group=number, colour=type)) +
  geom_text(size = 4, vjust = -0.5) + geom_point(size = 3) 
pcsplot + geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  coord_fixed()

fviz_pca_ind(pcamat1) + ggtitle("")

fviz_pca_var(pcamat1, col.circle="black") + ggtitle("")

Weighted PCA

data("x", package = "Hiiragi2013")
xwt = x[, x$genotype == "WT"]
sel = order(genefilter::rowVars(Biobase::exprs(xwt)), decreasing = TRUE)[1:100]
xwt = xwt[sel, ]
tab = table(xwt$sampleGroup)
tab
## 
##      E3.25 E3.5 (EPI)  E3.5 (PE) E4.5 (EPI)  E4.5 (PE) 
##         36         11         11          4          4
pcaMouse = dudi.pca(as.data.frame(t(Biobase::exprs(xwt))),
                    center = TRUE, scale = TRUE, nf = 2, 
                    scannf = FALSE)
fviz_eig(pcaMouse)

xwt$weight = 1 / as.numeric(tab[xwt$sampleGroup])
pcaMouse_wt = dudi.pca(as.data.frame(t(Biobase::exprs(xwt))),
                    row.w = xwt$weight,
                    center = TRUE, scale = TRUE, nf = 2, 
                    scannf = FALSE)
fviz_eig(pcaMouse_wt)

fviz_pca_ind(pcaMouse, geom = "point", col.ind = xwt$sampleGroup) +
  coord_fixed() 

fviz_pca_ind(pcaMouse_wt, geom = "point", col.ind = xwt$sampleGroup) +
  coord_fixed() 

Wine example

library("pheatmap")
load("wine.RData")
load("wineClass.RData")
head(wine)
##   Alcohol MalicAcid  Ash AlcAsh  Mg Phenols Flav NonFlavPhenols Proa Color
## 1   14.23      1.71 2.43   15.6 127    2.80 3.06           0.28 2.29  5.64
## 2   13.20      1.78 2.14   11.2 100    2.65 2.76           0.26 1.28  4.38
## 3   13.16      2.36 2.67   18.6 101    2.80 3.24           0.30 2.81  5.68
## 4   14.37      1.95 2.50   16.8 113    3.85 3.49           0.24 2.18  7.80
## 5   13.24      2.59 2.87   21.0 118    2.80 2.69           0.39 1.82  4.32
## 6   14.20      1.76 2.45   15.2 112    3.27 3.39           0.34 1.97  6.75
##    Hue   OD Proline
## 1 1.04 3.92    1065
## 2 1.05 3.40    1050
## 3 1.03 3.17    1185
## 4 0.86 3.45    1480
## 5 1.04 2.93     735
## 6 1.05 2.85    1450
load("wine.RData")
apply(wine, 2, mean)
##        Alcohol      MalicAcid            Ash         AlcAsh             Mg 
##     13.0006180      2.3363483      2.3665169     19.4949438     99.7415730 
##        Phenols           Flav NonFlavPhenols           Proa          Color 
##      2.2951124      2.0292697      0.3618539      1.5908989      5.0580899 
##            Hue             OD        Proline 
##      0.9574494      2.6116854    746.8932584
apply(wine, 2, sd)
##        Alcohol      MalicAcid            Ash         AlcAsh             Mg 
##      0.8118265      1.1171461      0.2743440      3.3395638     14.2824835 
##        Phenols           Flav NonFlavPhenols           Proa          Color 
##      0.6258510      0.9988587      0.1244533      0.5723589      2.3182859 
##            Hue             OD        Proline 
##      0.2285716      0.7099904    314.9074743
wine_scaled = scale(wine)
apply(wine_scaled, 2, mean)
##        Alcohol      MalicAcid            Ash         AlcAsh             Mg 
##  -8.591766e-16  -6.776446e-17   8.045176e-16  -7.720494e-17  -4.073935e-17 
##        Phenols           Flav NonFlavPhenols           Proa          Color 
##  -1.395560e-17   6.958263e-17  -1.042186e-16  -1.221369e-16   3.649376e-17 
##            Hue             OD        Proline 
##   2.093741e-16   3.003459e-16  -1.034429e-16
apply(wine_scaled, 2, sd)
##        Alcohol      MalicAcid            Ash         AlcAsh             Mg 
##              1              1              1              1              1 
##        Phenols           Flav NonFlavPhenols           Proa          Color 
##              1              1              1              1              1 
##            Hue             OD        Proline 
##              1              1              1
library(pheatmap)
pheatmap(1 - cor(wine))

wine_scaled <- as.data.frame(wine_scaled)
GGally::ggpairs(wine_scaled)

library(ade4)
winePCA = dudi.pca(wine, nf = 5, scale = TRUE, center = TRUE, scannf=FALSE)
class(winePCA)
## [1] "pca"  "dudi"
names(winePCA)
##  [1] "tab"  "cw"   "lw"   "eig"  "rank" "nf"   "c1"   "li"   "co"   "l1"  
## [11] "call" "cent" "norm"
library(factoextra)
fviz_eig(winePCA)

#eig_ratio <- winePCA$eig[2]/winePCA$eig[1]
fviz_pca_ind(winePCA) + 
  coord_fixed() 

load("wineClass.RData")
table(wine.class)
## wine.class
##     barolo grignolino    barbera 
##         59         71         48
fviz_pca_ind(winePCA, col.ind = wine.class,
  palette = c("#E41A1C", "#377EB8", "#4DAF4A")) + 
  coord_fixed() 

fviz_pca_ind(winePCA, col.ind = wine.class,
  palette = c("#E41A1C", "#377EB8", "#4DAF4A"),
  addEllipses = TRUE, ellipse.level = 0.9) + 
  coord_fixed() 

fviz_pca_ind(winePCA, axes = 2:3, col.ind = wine.class,
  palette = c("#E41A1C", "#377EB8", "#4DAF4A")) + 
  coord_fixed() 

fviz_pca_var(winePCA) 

fviz_contrib(winePCA, choice = "var", axes = 1)

fviz_contrib(winePCA, choice = "var", axes = 2)

fviz_contrib(winePCA, choice = "var", axes = 1:2)

fviz_pca_biplot(
  winePCA, geom = "point", 
  col.ind = wine.class, 
  col.var = "#c07d44", 
  addEllipses = TRUE, ellipse.level = 0.7) +
  coord_fixed() +
  scale_color_brewer(palette = "Set1")

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pheatmap_1.0.8   factoextra_1.0.5 GGally_1.4.0     ade4_1.7-11     
##  [5] forcats_0.3.0    stringr_1.3.0    dplyr_0.7.5.9000 purrr_0.2.4     
##  [9] readr_1.1.1      tidyr_0.8.0      tibble_1.4.2     ggplot2_3.0.0   
## [13] tidyverse_1.2.1 
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-137         bitops_1.0-6         phyloseq_1.23.1     
##  [4] bit64_0.9-7          lubridate_1.7.4      RColorBrewer_1.1-2  
##  [7] httr_1.3.1           rprojroot_1.3-2      tools_3.5.0         
## [10] backports_1.1.2      R6_2.2.2             vegan_2.5-2         
## [13] DBI_1.0.0            lazyeval_0.2.1       BiocGenerics_0.26.0 
## [16] mgcv_1.8-23          colorspace_1.3-2     permute_0.9-4       
## [19] withr_2.1.2          tidyselect_0.2.4     gridExtra_2.3       
## [22] mnormt_1.5-5         bit_1.1-12           compiler_3.5.0      
## [25] cli_1.0.0            rvest_0.3.2          Biobase_2.40.0      
## [28] xml2_1.2.0           labeling_0.3         scales_0.5.0        
## [31] psych_1.8.3.3        genefilter_1.62.0    digest_0.6.15       
## [34] foreign_0.8-70       rmarkdown_1.9        XVector_0.20.0      
## [37] pkgconfig_2.0.1      htmltools_0.3.6      rlang_0.2.0.9001    
## [40] readxl_1.1.0         RSQLite_2.1.1        rstudioapi_0.7      
## [43] bindr_0.1.1          jsonlite_1.5         RCurl_1.95-4.10     
## [46] magrittr_1.5         biomformat_1.8.0     Matrix_1.2-14       
## [49] Rcpp_0.12.17         munsell_0.4.3        S4Vectors_0.18.3    
## [52] Rhdf5lib_1.2.1       ape_5.1              viridis_0.5.1       
## [55] stringi_1.1.7        yaml_2.1.18          MASS_7.3-49         
## [58] zlibbioc_1.26.0      rhdf5_2.24.0         plyr_1.8.4          
## [61] blob_1.1.1           grid_3.5.0           parallel_3.5.0      
## [64] ggrepel_0.7.0        crayon_1.3.4         lattice_0.20-35     
## [67] Biostrings_2.48.0    haven_1.1.1          splines_3.5.0       
## [70] annotate_1.58.0      multtest_2.36.0      hms_0.4.2           
## [73] knitr_1.20           pillar_1.2.2         igraph_1.2.1        
## [76] ggpubr_0.1.6         reshape2_1.4.3       codetools_0.2-15    
## [79] stats4_3.5.0         XML_3.98-1.11        glue_1.2.0          
## [82] evaluate_0.10.1      data.table_1.11.4    modelr_0.1.1        
## [85] foreach_1.4.4        cellranger_1.1.0     gtable_0.2.0        
## [88] reshape_0.8.7        assertthat_0.2.0     xtable_1.8-2        
## [91] broom_0.4.4          viridisLite_0.3.0    survival_2.42-3     
## [94] iterators_1.0.9      memoise_1.1.0        AnnotationDbi_1.42.1
## [97] IRanges_2.14.10      bindrcpp_0.2.2       cluster_2.0.7-1